home *** CD-ROM | disk | FTP | other *** search
/ MacHack 2000 / MacHack 2000.toast / pc / The Hacks / MacHacksBug / Python 1.5.2c1 / Tools / webchecker / websucker.py < prev    next >
Encoding:
Python Source  |  2000-06-23  |  3.1 KB  |  119 lines

  1. #! /usr/bin/env python
  2.  
"""A variant on webchecker that creates a mirror copy of a remote site.

Usage: websucker.py [-qv] ... [rooturl] ...

-q sets the verbosity to 0; each -v raises it by one.  Every remaining
argument is added as a root URL.  Pages are fetched through a subclass
of webchecker.Checker (Sucker) and written to local files whose paths
are built from the lowercased host name and the URL path; a URL whose
path is empty or ends in "/" is stored as index.html.
"""

# Revision keyword string; reduced to the bare number further down.
__version__ = "$Revision: 1.6 $"
  6.  
  7. import os
  8. import sys
  9. import string
  10. import urllib
  11. import getopt
  12.  
  13. import webchecker
  14.  
# Extract real version number if necessary.
# A string such as "$Revision: 1.6 $" splits into three whitespace-separated
# fields, the middle one being the number.  A string that does not start
# with '$', or that does not split into exactly three fields, is left
# untouched.
if __version__[0] == '$':
    _v = string.split(__version__)
    if len(_v) == 3:
        __version__ = _v[1]
  20.  
  21. def main():
  22.     verbose = webchecker.VERBOSE
  23.     try:
  24.         opts, args = getopt.getopt(sys.argv[1:], "qv")
  25.     except getopt.error, msg:
  26.         print msg
  27.         print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
  28.         return 2
  29.     for o, a in opts:
  30.         if o == "-q":
  31.             verbose = 0
  32.         if o == "-v":
  33.             verbose = verbose + 1
  34.     c = Sucker()
  35.     c.setflags(verbose=verbose)
  36.     c.urlopener.addheaders = [
  37.             ('User-agent', 'websucker/%s' % __version__),
  38.         ]
  39.     for arg in args:
  40.         print "Adding root", arg
  41.         c.addroot(arg)
  42.     print "Run..."
  43.     c.run()
  44.  
  45. class Sucker(webchecker.Checker):
  46.  
  47.     checkext = 0
  48.  
  49.     def readhtml(self, url):
  50.         text = None
  51.         path = self.savefilename(url)
  52.         try:
  53.             f = open(path, "rb")
  54.         except IOError:
  55.             f = self.openpage(url)
  56.             if f:
  57.                 info = f.info()
  58.                 nurl = f.geturl()
  59.                 if nurl != url:
  60.                     url = nurl
  61.                     path = self.savefilename(url)
  62.                 text = f.read()
  63.                 f.close()
  64.                 self.savefile(text, path)
  65.                 if not self.checkforhtml(info, url):
  66.                     text = None
  67.         else:
  68.             if self.checkforhtml({}, url):
  69.                 text = f.read()
  70.             f.close()
  71.         return text, url
  72.  
  73.     def savefile(self, text, path):
  74.         dir, base = os.path.split(path)
  75.         makedirs(dir)
  76.         try:
  77.             f = open(path, "wb")
  78.             f.write(text)
  79.             f.close()
  80.             self.message("saved %s", path)
  81.         except IOError, msg:
  82.             self.message("didn't save %s: %s", path, str(msg))
  83.  
  84.     def savefilename(self, url):
  85.         type, rest = urllib.splittype(url)
  86.         host, path = urllib.splithost(rest)
  87.         while path[:1] == "/": path = path[1:]
  88.         user, host = urllib.splituser(host)
  89.         host, port = urllib.splitnport(host)
  90.         host = string.lower(host)
  91.         if not path or path[-1] == "/":
  92.             path = path + "index.html"
  93.         if os.sep != "/":
  94.             path = string.join(string.split(path, "/"), os.sep)
  95.         path = os.path.join(host, path)
  96.         return path
  97.  
  98. def makedirs(dir):
  99.     if not dir:
  100.         return
  101.     if os.path.exists(dir):
  102.         if not os.path.isdir(dir):
  103.             try:
  104.                 os.rename(dir, dir + ".bak")
  105.                 os.mkdir(dir)
  106.                 os.rename(dir + ".bak", os.path.join(dir, "index.html"))
  107.             except os.error:
  108.                 pass
  109.         return
  110.     head, tail = os.path.split(dir)
  111.     if not tail:
  112.         print "Huh?  Don't know how to make dir", dir
  113.         return
  114.     makedirs(head)
  115.     os.mkdir(dir, 0777)
  116.  
# When run as a script, exit with main()'s return value; a false result
# (main() returns None after a normal run) becomes exit status 0.
if __name__ == '__main__':
    sys.exit(main() or 0)
  119.